train <- read.csv(file="train.csv", na.strings=c(""))
test <- read.csv(file="test.csv", na.strings=c(""))
summary(train)
Dates Category Descript
2011-01-01 00:01:00: 185 LARCENY/THEFT :174900 GRAND THEFT FROM LOCKED AUTO : 60022
2006-01-01 00:01:00: 136 OTHER OFFENSES:126182 LOST PROPERTY : 31729
2012-01-01 00:01:00: 94 NON-CRIMINAL : 92304 BATTERY : 27441
2006-01-01 12:00:00: 63 ASSAULT : 76876 STOLEN AUTOMOBILE : 26897
2007-06-01 00:01:00: 61 DRUG/NARCOTIC : 53971 DRIVERS LICENSE, SUSPENDED OR REVOKED: 26839
2006-06-01 00:01:00: 58 VEHICLE THEFT : 53781 WARRANT ARREST : 23754
(Other) :877452 (Other) :300035 (Other) :681367
DayOfWeek PdDistrict Resolution Address
Friday :133734 SOUTHERN :157182 NONE :526790 800 Block of BRYANT ST : 26533
Monday :121584 MISSION :119908 ARREST, BOOKED :206403 800 Block of MARKET ST : 6581
Saturday :126810 NORTHERN :105296 ARREST, CITED : 77004 2000 Block of MISSION ST: 5097
Sunday :116707 BAYVIEW : 89431 LOCATED : 17101 1000 Block of POTRERO AV: 4063
Thursday :125038 CENTRAL : 85460 PSYCHOPATHIC CASE: 14534 900 Block of MARKET ST : 3251
Tuesday :124965 TENDERLOIN: 81809 UNFOUNDED : 9585 0 Block of TURK ST : 3228
Wednesday:129211 (Other) :238963 (Other) : 26632 (Other) :829296
X Y
Min. :-122.5 Min. :37.71
1st Qu.:-122.4 1st Qu.:37.75
Median :-122.4 Median :37.78
Mean :-122.4 Mean :37.77
3rd Qu.:-122.4 3rd Qu.:37.78
Max. :-120.5 Max. :90.00
library(Amelia)
Loading required package: Rcpp
##
## Amelia II: Multiple Imputation
## (Version 1.7.4, built: 2015-12-05)
## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## Refer to http://gking.harvard.edu/amelia/ for more information
##
missmap(train, main = "Missing values vs observed")
It seems that here are no missing values. Great!
# Overall structure
str(train)
'data.frame': 878049 obs. of 9 variables:
$ Dates : Factor w/ 389257 levels "2003-01-06 00:01:00",..: 389257 389257 389256 389255 389255 389255 389255 389255 389254 389254 ...
$ Category : Factor w/ 39 levels "ARSON","ASSAULT",..: 38 22 22 17 17 17 37 37 17 17 ...
$ Descript : Factor w/ 879 levels "ABANDONMENT OF CHILD",..: 867 811 811 405 405 407 740 740 405 405 ...
$ DayOfWeek : Factor w/ 7 levels "Friday","Monday",..: 7 7 7 7 7 7 7 7 7 7 ...
$ PdDistrict: Factor w/ 10 levels "BAYVIEW","CENTRAL",..: 5 5 5 5 6 3 3 1 7 2 ...
$ Resolution: Factor w/ 17 levels "ARREST, BOOKED",..: 1 1 1 12 12 12 12 12 12 12 ...
$ Address : Factor w/ 23228 levels "0 Block of HARRISON ST",..: 19791 19791 22698 4267 1844 1506 13323 18055 11385 17659 ...
$ X : num -122 -122 -122 -122 -122 ...
$ Y : num 37.8 37.8 37.8 37.8 37.8 ...
# Get to know data types
sapply(train, class)
Dates Category Descript DayOfWeek PdDistrict Resolution Address X Y
"factor" "factor" "factor" "factor" "factor" "factor" "factor" "numeric" "numeric"
# summarize the class distribution
cat_percentage <- prop.table(table(train$Category)) * 100
cbind(freq=table(train$Category), percentage=cat_percentage)
freq percentage
ARSON 1513 1.723138e-01
ASSAULT 76876 8.755320e+00
BAD CHECKS 406 4.623888e-02
BRIBERY 289 3.291388e-02
BURGLARY 36755 4.185985e+00
DISORDERLY CONDUCT 4320 4.919999e-01
DRIVING UNDER THE INFLUENCE 2268 2.582999e-01
DRUG/NARCOTIC 53971 6.146696e+00
DRUNKENNESS 4280 4.874443e-01
EMBEZZLEMENT 1166 1.327944e-01
EXTORTION 256 2.915555e-02
FAMILY OFFENSES 491 5.591943e-02
FORGERY/COUNTERFEITING 10609 1.208247e+00
FRAUD 16679 1.899552e+00
GAMBLING 146 1.662777e-02
KIDNAPPING 2341 2.666138e-01
LARCENY/THEFT 174900 1.991916e+01
LIQUOR LAWS 1903 2.167305e-01
LOITERING 1225 1.395139e-01
MISSING PERSON 25989 2.959858e+00
NON-CRIMINAL 92304 1.051240e+01
OTHER OFFENSES 126182 1.437072e+01
PORNOGRAPHY/OBSCENE MAT 22 2.505555e-03
PROSTITUTION 7484 8.523442e-01
RECOVERED VEHICLE 3138 3.573832e-01
ROBBERY 23000 2.619444e+00
RUNAWAY 1946 2.216277e-01
SECONDARY CODES 9985 1.137180e+00
SEX OFFENSES FORCIBLE 4388 4.997443e-01
SEX OFFENSES NON FORCIBLE 148 1.685555e-02
STOLEN PROPERTY 4540 5.170554e-01
SUICIDE 508 5.785554e-02
SUSPICIOUS OCC 31414 3.577705e+00
TREA 6 6.833332e-04
TRESPASS 7326 8.343498e-01
VANDALISM 44725 5.093679e+00
VEHICLE THEFT 53781 6.125057e+00
WARRANTS 42214 4.807704e+00
WEAPON LAWS 8555 9.743192e-01
# Get top crimes
crime_categories_df <- as.data.frame(table(train$Category))
crime_categories_df[with(crime_categories_df, order(-Freq)),]
top_crimes <- head(crime_categories_df[with(crime_categories_df, order(-Freq)),], n=10)
# Create data for the graph.
x <- top_crimes$Freq
labels <- top_crimes$Var1
piepercent <- round(100*x/sum(x), 1)
# Plot the chart.
pie(x, labels = piepercent, main = "Top 10 Crimes",col = rainbow(length(x)))
legend("right", as.character(labels), cex = 0.8,
fill = rainbow(length(x)))
We can see that larceny/theft and non-criminal takes up much of the pie, followed by non-criminal and assult. ‘Other offenses’ also accounts for a large proportion, but it contains ambiguities and lacks information.
Is there a day of week that has significantly more crimes than other days? The distribution is rather even. But Friday is surely a peak (maybe people consume more after a week’s work) while Sunday is a slump (most people stay at home).
library(ggplot2)
Attaching package: 'ggplot2'
The following object is masked from 'package:NLP':
annotate
table(train$Category ,train$DayOfWeek)
Friday Monday Saturday Sunday Thursday Tuesday Wednesday
ARSON 220 228 220 211 199 235 200
ASSAULT 11160 10560 11995 12082 10246 10280 10553
BAD CHECKS 62 66 45 20 66 76 71
BRIBERY 49 41 42 41 39 37 40
BURGLARY 6327 5262 4754 4231 5350 5374 5457
DISORDERLY CONDUCT 541 608 624 586 644 657 660
DRIVING UNDER THE INFLUENCE 352 263 457 442 282 251 221
DRUG/NARCOTIC 7420 7823 6390 6143 8454 8474 9267
DRUNKENNESS 622 513 833 813 496 461 542
EMBEZZLEMENT 211 222 137 108 165 156 167
EXTORTION 35 30 32 39 40 39 41
FAMILY OFFENSES 82 69 59 54 63 85 79
FORGERY/COUNTERFEITING 1757 1704 1178 901 1610 1752 1707
FRAUD 2641 2533 2256 1874 2351 2506 2518
GAMBLING 35 16 21 12 20 12 30
KIDNAPPING 385 340 355 374 289 306 292
LARCENY/THEFT 27104 23570 27217 24150 24415 23957 24487
LIQUOR LAWS 291 188 297 222 248 323 334
LOITERING 139 193 140 155 186 252 160
MISSING PERSON 4663 3592 3752 3061 3680 3655 3586
NON-CRIMINAL 13984 12855 14007 12973 12819 12738 12928
OTHER OFFENSES 18588 17787 17129 15457 18462 18809 19950
PORNOGRAPHY/OBSCENE MAT 4 3 1 3 5 3 3
PROSTITUTION 1158 409 850 620 1547 1421 1479
RECOVERED VEHICLE 494 530 343 307 432 517 515
ROBBERY 3384 3194 3428 3284 3216 3221 3273
RUNAWAY 344 280 268 205 305 275 269
SECONDARY CODES 1392 1483 1462 1543 1389 1343 1373
SEX OFFENSES FORCIBLE 621 607 662 690 585 597 626
SEX OFFENSES NON FORCIBLE 28 23 21 16 15 23 22
STOLEN PROPERTY 647 636 581 583 679 714 700
SUICIDE 72 75 73 67 89 66 66
SUSPICIOUS OCC 4924 4447 4155 4010 4510 4517 4851
TREA 1 1 2 0 1 1 0
TRESPASS 1064 1081 983 915 1047 1114 1122
VANDALISM 7092 5946 7326 6602 5980 5852 5927
VEHICLE THEFT 8613 7412 8119 7504 7456 7263 7414
WARRANTS 5926 5811 5364 5281 6376 6427 7029
WEAPON LAWS 1302 1183 1232 1128 1282 1176 1252
g <- ggplot(train, aes(DayOfWeek))
g + geom_bar(aes(fill = Category)) + theme(legend.position="bottom")
How does criminal activities change over the years? Does it increase or decrease or stay the same?
train$Year <- substring(train$Dates, 1, 4)
train$Month <- substring(train$Dates, 6, 7)
crime_history <- head(as.vector(table(train$Month,train$Year)), -12)
crime_history
[1] 5831 5964 6099 6758 7025 6052 5503 5800 6704 7259 6194 4713 5938 5626 7262 6988 6865 5614 5679 6439 6361
[22] 6695 5011 4944 5669 5252 5448 5586 6426 6134 6512 5428 5426 6292 6422 6184 5896 5537 5418 5524 6177 6393
[43] 6246 5523 5312 6183 5868 5832 5094 5093 5209 5336 6253 5984 5894 5331 5509 6733 6253 5326 5182 5284 5974
[64] 6028 6597 5556 5631 5275 6367 7173 6371 4736 5272 5237 6580 6472 6355 4543 4960 6199 6671 6593 5581 4537
[85] 5179 5063 4997 4890 5708 5888 6207 5758 5453 5395 5906 6098 6130 5029 5071 5123 5742 5915 5895 5056 5278
[106] 5410 5761 6209 5987 5367 5341 5618 6563 6024 5692 5481 5585 7497 6584 5992 5712 5694 5830 6615 6924 6797
[127] 5944 6103 6649 7741 6553 5044 5780 5659 6240 6549 6759 5992 5808 6147 6667 7303 6471 5391
crime_ts <- ts(crime_history, frequency=12, start=c(2003,1))
crime_ts
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
2003 5831 5964 6099 6758 7025 6052 5503 5800 6704 7259 6194 4713
2004 5938 5626 7262 6988 6865 5614 5679 6439 6361 6695 5011 4944
2005 5669 5252 5448 5586 6426 6134 6512 5428 5426 6292 6422 6184
2006 5896 5537 5418 5524 6177 6393 6246 5523 5312 6183 5868 5832
2007 5094 5093 5209 5336 6253 5984 5894 5331 5509 6733 6253 5326
2008 5182 5284 5974 6028 6597 5556 5631 5275 6367 7173 6371 4736
2009 5272 5237 6580 6472 6355 4543 4960 6199 6671 6593 5581 4537
2010 5179 5063 4997 4890 5708 5888 6207 5758 5453 5395 5906 6098
2011 6130 5029 5071 5123 5742 5915 5895 5056 5278 5410 5761 6209
2012 5987 5367 5341 5618 6563 6024 5692 5481 5585 7497 6584 5992
2013 5712 5694 5830 6615 6924 6797 5944 6103 6649 7741 6553 5044
2014 5780 5659 6240 6549 6759 5992 5808 6147 6667 7303 6471 5391
plot.ts(crime_ts)
We can see that the basic trend is declining from 2004 to 2010. Then, crime rate begins to rise until 2014. But noticeably we can clearly observe the seasonality throughout the years. So it’s worthwhile to investigate the fluctuation over the months. Maybe some analysis over time-in-a-day would be helpful too. For now let’s just decompose the data.
crime_components <- decompose(crime_ts)
plot(crime_components)
It seems the trend is just what I described, roughly. The seasonal component seems really interesting.
train_incomplete <- subset(train, Year != 2015)
tb <- table(train_incomplete$Month, train_incomplete$Category)
df <- data.frame(month=as.integer(row.names(tb)), crime_freq=as.vector(tb), crime_categories=rep(colnames(tb), each=length(row.names(tb))))
# plot
ggplot(data = df, aes(x=month, y=crime_freq)) + geom_line(aes(colour=crime_categories)) + theme(legend.position="left")
# Create the data for the chart.
tb <- table(train_incomplete$Month, train_incomplete$Category)
v = rowSums(tb)
# Plot the bar chart.
plot(v,type = "o", col = "red", xlab = "Month", ylab = "Crime Frequency",
main = "Monthly Crime")
We can see that, usually December and Feburary has the lowest crime rate (perhaps people feel too cold to leave home). June, July, August have low frequency as well. Crime activities peak in May and October. This pattern is observed by all major categories of crime. However, the data of December is significantly lower than the others. Maybe it’s because of the lack of data in 2015. I’ll get rid of the data of 2015 when necessary and adjust the previous results.
Just an example of mapping SF.
library(ggplot2)
library(ggmap)
Google Maps API Terms of Service: http://developers.google.com/maps/terms.
Please cite ggmap if you use it: see citation('ggmap') for details.
library(maptools)
Loading required package: sp
Checking rgeos availability: TRUE
library(ggthemes)
library(rgeos)
rgeos version: 0.3-23, (SVN revision 546)
GEOS runtime version: 3.6.1-CAPI-1.10.1 r0
Linking to sp version: 1.2-4
Polygon checking: TRUE
library(broom)
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:rgeos':
intersect, setdiff, union
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(plyr)
---------------------------------------------------------------------------------------------------------------
You have loaded plyr after dplyr - this is likely to cause problems.
If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
library(plyr); library(dplyr)
---------------------------------------------------------------------------------------------------------------
Attaching package: 'plyr'
The following objects are masked from 'package:dplyr':
arrange, count, desc, failwith, id, mutate, rename, summarise, summarize
library(grid)
library(gridExtra)
Attaching package: 'gridExtra'
The following object is masked from 'package:dplyr':
combine
library(reshape2)
library(scales)
plotTheme <- function(base_size = 12) {
theme(
text = element_text( color = "black"),
plot.title = element_text(size = 18,colour = "black"),
plot.subtitle = element_text(face="italic"),
plot.caption = element_text(hjust=0),
axis.ticks = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_line("grey80", size = 0.1),
panel.grid.minor = element_blank(),
strip.background = element_rect(fill = "grey80", color = "white"),
strip.text = element_text(size=12),
axis.title = element_text(size=8),
axis.text = element_text(size=8),
axis.title.x = element_text(hjust=1),
axis.title.y = element_text(hjust=1),
plot.background = element_blank(),
legend.background = element_blank(),
legend.title = element_text(colour = "black", face = "italic"),
legend.text = element_text(colour = "black", face = "italic"))
}
# And another that we will use for maps
mapTheme <- function(base_size = 12) {
theme(
text = element_text( color = "black"),
plot.title = element_text(size = 18,colour = "black"),
plot.subtitle=element_text(face="italic"),
plot.caption=element_text(hjust=0),
axis.ticks = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_line("grey80", size = 0.1),
strip.text = element_text(size=12),
axis.title = element_blank(),
axis.text = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.minor = element_blank(),
strip.background = element_rect(fill = "grey80", color = "white"),
plot.background = element_blank(),
legend.background = element_blank(),
legend.title = element_text(colour = "black", face = "italic"),
legend.text = element_text(colour = "black", face = "italic"))
}
# Define some palettes
palette_9_colors <- c("#0DA3A0","#2999A9","#458FB2","#6285BB","#7E7CC4","#9A72CD","#B768D6","#D35EDF","#F055E9")
palette_8_colors <- c("#0DA3A0","#2D97AA","#4D8CB4","#6E81BF","#8E76C9","#AF6BD4","#CF60DE","#F055E9")
palette_7_colors <- c("#2D97AA","#4D8CB4","#6E81BF","#8E76C9","#AF6BD4","#CF60DE","#F055E9")
palette_1_colors <- c("#0DA3A0")
# Read in a csv of home sale transactions directly from github.
sf <- read.csv("https://raw.githubusercontent.com/simonkassel/Visualizing_SF_home_prices_R/master/Data/SF_home_sales_demo_data.csv")
# We will need to consider Sale Year as a categorical variable so we convert it from a numeric variable to a factor
sf$SaleYr <- as.factor(sf$SaleYr)
# Define the URL of the zipped shapefile
URL <- "https://github.com/simonkassel/Visualizing_SF_home_prices_R/raw/master/Data/SF_neighborhoods.zip"
# Download the shapefile to your working directory and unzip it.
download.file(URL, "SF_neighborhoods.zip")
trying URL 'https://github.com/simonkassel/Visualizing_SF_home_prices_R/raw/master/Data/SF_neighborhoods.zip'
Content type 'application/zip' length 141938 bytes (138 KB)
==================================================
downloaded 138 KB
unzip("SF_neighborhoods.zip")
# Read it into R as a spatial polygons data frame & plot
neighb <- readShapePoly("SF_neighborhoods")
use rgdal::readOGR or sf::st_read
plot(neighb)
# Define the bounding box
bbox <- neighb@bbox
# Manipulate these values slightly so that we get some padding on our basemap between the edge of the data and the edge of the map
sf_bbox <- c(left = bbox[1, 1] - .01, bottom = bbox[2, 1] - .005,
right = bbox[1, 2] + .01, top = bbox[2, 2] + .005)
# Download the basemap
basemap <- get_stamenmap(
bbox = sf_bbox,
zoom = 13,
maptype = "toner-lite")
Map from URL : http://tile.stamen.com/toner-lite/13/1307/3165.png
Map from URL : http://tile.stamen.com/toner-lite/13/1308/3165.png
Map from URL : http://tile.stamen.com/toner-lite/13/1309/3165.png
Map from URL : http://tile.stamen.com/toner-lite/13/1310/3165.png
Map from URL : http://tile.stamen.com/toner-lite/13/1311/3165.png
Map from URL : http://tile.stamen.com/toner-lite/13/1307/3166.png
Map from URL : http://tile.stamen.com/toner-lite/13/1308/3166.png
Map from URL : http://tile.stamen.com/toner-lite/13/1309/3166.png
Map from URL : http://tile.stamen.com/toner-lite/13/1310/3166.png
Map from URL : http://tile.stamen.com/toner-lite/13/1311/3166.png
Map from URL : http://tile.stamen.com/toner-lite/13/1307/3167.png
Map from URL : http://tile.stamen.com/toner-lite/13/1308/3167.png
Map from URL : http://tile.stamen.com/toner-lite/13/1309/3167.png
Map from URL : http://tile.stamen.com/toner-lite/13/1310/3167.png
Map from URL : http://tile.stamen.com/toner-lite/13/1311/3167.png
Map from URL : http://tile.stamen.com/toner-lite/13/1307/3168.png
Map from URL : http://tile.stamen.com/toner-lite/13/1308/3168.png
Map from URL : http://tile.stamen.com/toner-lite/13/1309/3168.png
Map from URL : http://tile.stamen.com/toner-lite/13/1310/3168.png
Map from URL : http://tile.stamen.com/toner-lite/13/1311/3168.png
# # Map it
# bmMap <- ggmap(basemap) + mapTheme() +
# labs(title="San Francisco basemap")
# bmMap
# Define the bounding box
bbox <- neighb@bbox
# Manipulate these values slightly so that we get some padding on our basemap between the edge of the data and the edge of the map
sf_bbox <- c(left = bbox[1, 1] - .01, bottom = bbox[2, 1] - .005,
right = bbox[1, 2] + .01, top = bbox[2, 2] + .005)
# Download the basemap
basemap <- get_stamenmap(
bbox = sf_bbox,
zoom = 13,
maptype = "toner-lite")
# # Map it
# bmMap <- ggmap(basemap) + mapTheme() +
# labs(title="San Francisco basemap")
# bmMap
#
# prices_mapped_by_year <- ggmap(basemap) +
# geom_point(data = sf, aes(x = long, y = lat, color = SalePrice),
# size = .25, alpha = 0.6) +
# facet_wrap(~SaleYr, scales = "fixed", ncol = 4) +
# coord_map() +
# mapTheme() + theme(legend.position = c(.85, .25)) +
# scale_color_gradientn("Sale Price",
# colors = palette_8_colors,
# labels = scales::dollar_format(prefix = "$")) +
# labs(title="Distribution of San Francisco home prices",
# subtitle="Nominal prices (2009 - 2015)",
# caption="Source: San Francisco Office of the Assessor-Recorder\n@KenSteif & @SimonKassel")
# prices_mapped_by_year
train[, c("X", "Y", "Year", "Category")]
crime_location <- data.frame( train[, c("X", "Y", "Year", "Category")] )
crime_location
# Manipulate these values slightly so that we get some padding on our basemap between the edge of the data and the edge of the map
sf_bbox <- c(left = bbox[1, 1] - .01, bottom = bbox[2, 1] - .005,
right = bbox[1, 2] + .01, top = bbox[2, 2] + .005)
# Download the basemap
basemap <- get_stamenmap(
bbox = sf_bbox,
zoom = 13,
maptype = "toner-lite")
# Map it
bmMap <- ggmap(basemap) + mapTheme() +
labs(title="San Francisco Crime Map")
bmMap + geom_point(data=crime_location, aes(x=X, y=Y, color=Category), size=0.7, alpha=0.3) + theme(legend.position = "right")
top_crime_map <- crime_location[crime_location$Category %in% as.vector(top_crimes$Var1),]
bmMapTop <- ggmap(basemap) + mapTheme() +
labs(title="San Francisco Top Crime Map")
bmMapTop + geom_point(data=top_crime_map, aes(x=X, y=Y, color=Category), size=0.7, alpha=0.3) + theme(legend.position = "right")
Although this map is beautiful, it provides us with too much information to be insightful. To get more out of this visualisation, we need to limit the categories to those most ‘popular’ crimes, or we need to regroup the crime categories.
# Map it
bmMap <- ggmap(basemap) + mapTheme() +
labs(title="San Francisco basemap")
prices_mapped_by_year <- ggmap(basemap) +
geom_point(data = top_crime_map, aes(x = X, y = Y, color = Category),
size = .25, alpha = 0.6) +
facet_wrap(~Year, scales = "fixed", ncol = 4) +
coord_map() +
mapTheme() + theme(legend.position = "right") +
labs(title="Top 10 Crimes in San Francisco",
subtitle="2003 - 2015")
prices_mapped_by_year
Ok anyways… Thanks to Kelvin, I noticed there is a very strong correlation between the Descrition column and the Category column. Some text mining is needed though.
#train$Descript
library(tm)
library(wordcloud)
descript <- removeNumbers(removePunctuation(tolower(as.vector(train$Descript))))
descript <- removeWords(descript, stopwords("en"))
descript_corpus <- Corpus(VectorSource(train$Descript))
descript_corpus = tm_map(descript_corpus, content_transformer(tolower))
descript_corpus = tm_map(descript_corpus, removeNumbers)
descript_corpus = tm_map(descript_corpus, removePunctuation)
descript_corpus = tm_map(descript_corpus, removeWords, c("the", "and"))
descript_corpus = tm_map(descript_corpus, stripWhitespace)
descript_dtm <- DocumentTermMatrix(descript_corpus)
descript_dtm <- removeSparseTerms(descript_dtm, 0.975)
findFreqTerms(descript_dtm, 100)
[1] "arrest" "warrant" "traffic" "violation" "auto" "from" "grand" "locked"
[9] "theft" "automobile" "stolen" "petty" "malicious" "mischief" "vandalism" "property"
[17] "robbery" "with" "lost" "vehicle" "suspicious" "aided" "case" "drivers"
[25] "license" "revoked" "suspended" "burglary" "entry" "possession" "battery" "occurrence"
raw_freq = data.frame(sort(colSums(as.matrix(descript_dtm)), decreasing=TRUE))
raw_freq
dim(raw_freq)
[1] 32 1
freq_words <- rownames(raw_freq)
freq_words
[1] "theft" "from" "grand" "auto" "property" "locked" "petty" "possession"
[9] "stolen" "violation" "malicious" "mischief" "arrest" "with" "license" "entry"
[17] "vandalism" "battery" "lost" "burglary" "case" "aided" "vehicle" "drivers"
[25] "automobile" "revoked" "suspended" "suspicious" "warrant" "robbery" "occurrence" "traffic"
wordcloud(rownames(raw_freq), raw_freq[,1], max.words=100, colors=brewer.pal(1, "Dark2"))
minimal value for n is 3, returning requested palette with 3 different levels
descript_dtm_tfidf <- DocumentTermMatrix(descript_corpus, control = list(weighting = weightTfIdf))
descript_dtm_tfidf = removeSparseTerms(descript_dtm_tfidf, 0.975)
freq = data.frame(sort(colSums(as.matrix(descript_dtm_tfidf)), decreasing=TRUE))
freq
freq_words <- c(freq_words, rownames(freq))
freq_words <- unique(freq_words)
freq_words
[1] "theft" "from" "grand" "auto" "property" "locked" "petty" "possession"
[9] "stolen" "violation" "malicious" "mischief" "arrest" "with" "license" "entry"
[17] "vandalism" "battery" "lost" "burglary" "case" "aided" "vehicle" "drivers"
[25] "automobile" "revoked" "suspended" "suspicious" "warrant" "robbery" "occurrence" "traffic"
wordcloud(rownames(freq), freq[,1], max.words=100, colors=brewer.pal(1, "Dark2"))
minimal value for n is 3, returning requested palette with 3 different levels
Ok, let’s try to search for some keywords in the descript column that matches the category column.
unique_cat <- unique(train$Category)
x <- ""
for(cat in unique(train$Category)) {
x <- paste(x, cat, sep="|")
}
x <- tolower(substring(x,2))
match_count_table <- table(grepl(x, tolower(train$Descript)))
match_count_table
FALSE TRUE
704738 173311
prop.table(match_count_table)
FALSE TRUE
0.8026181 0.1973819
So about 20% of the DESCRIPT contains the CATEGORY keywords. There is a rather strong correlation indeed. This is definitely going to be a feature. How about the holidays? Let’s get some data about the public holiday in San Francisco!!!
regular_day <- train
train$Holiday <- "Regular"
# Holidays
new_year <- regular_day[grepl("[0-9]{4}-01-01", regular_day$Dates),]
train$Holiday[grepl("[0-9]{4}-01-01", train$Dates)] <- "NewYear"
regular_day <- regular_day[!grepl("[0-9]{4}-01-01", regular_day$Dates),]
#Valentine
valentine <- regular_day[grepl("[0-9]{4}-02-14", regular_day$Dates),]
train$Holiday[grepl("[0-9]{4}-02-14", train$Dates)] <- "Valentine"
regular_day <- regular_day[!grepl("[0-9]{4}-02-14", regular_day$Dates),]
#MLK <- # Third Monday in January
#presidents_day <- # Third Monday in Febrary
#easter <- # Arr
#memorial_day <- # Last Monday in May
independence_day <- regular_day[grepl("[0-9]{4}-07-04", regular_day$Dates),]
train$Holiday[grepl("[0-9]{4}-07-04", train$Dates)] <- "Independence"
regular_day <- regular_day[!grepl("[0-9]{4}-07-04", regular_day$Dates),]
#labor_day <- # First Monday in September
#columbus_day <- # Second Monday in October
veterans_day <- regular_day[grepl("[0-9]{4}-11-11", regular_day$Dates),]
train$Holiday[grepl("[0-9]{4}-11-11", train$Dates)] <- "Veterans"
regular_day <- regular_day[!grepl("[0-9]{4}-11-11", regular_day$Dates),]
#thanks_giving <- # Fourth Thursday in November
christmas <- regular_day[grepl("[0-9]{4}-12-25", regular_day$Dates),]
train$Holiday[grepl("[0-9]{4}-12-25", train$Dates)] <- "Christmas"
regular_day <- regular_day[!grepl("[0-9]{4}-12-25", regular_day$Dates),]
library(ggplot2)
Attaching package: 'ggplot2'
The following object is masked from 'package:NLP':
annotate
new_year_top_crime <- new_year[new_year$Category %in% as.vector(top_crimes$Var1),]
g <- ggplot(new_year_top_crime, aes(Year))
g + geom_bar() + geom_bar(aes(fill=Category)) + ggtitle("New Year Crime")
ind_top_crime <- independence_day[independence_day$Category %in% as.vector(top_crimes$Var1),]
g <- ggplot(ind_top_crime, aes(Year))
g + geom_bar() + geom_bar(aes(fill=Category)) + ggtitle("Independence Day Crime")
veterans_top_crime <- veterans_day[veterans_day$Category %in% as.vector(top_crimes$Var1),]
g <- ggplot(veterans_top_crime, aes(Year))
g + geom_bar() + geom_bar(aes(fill=Category)) + ggtitle("Veterans Day Crime")
christmas_top_crime <- christmas[christmas$Category %in% as.vector(top_crimes$Var1),]
g <- ggplot(christmas_top_crime, aes(Year))
g + geom_bar() + geom_bar(aes(fill=Category)) + ggtitle("Christmas Crime")
Time to do some averaging…
library(matrixStats)
new_year_avg <- colMedians(table(new_year_top_crime$Year, droplevels(new_year_top_crime$Category)))
valentine_top_crime <- valentine[valentine$Category %in% as.vector(top_crimes$Var1),]
valentine_avg <- colMedians(table(valentine_top_crime$Year, droplevels(valentine_top_crime$Category)))
ind_day_avg <- colMedians(table(ind_top_crime$Year, droplevels(ind_top_crime$Category)))
veterans_avg <- colMedians(table(veterans_top_crime$Year, droplevels(veterans_top_crime$Category)))
christmas_avg <- colMedians(table(christmas_top_crime$Year, droplevels(christmas_top_crime$Category)))
reg_day_top_crime <- regular_day[regular_day$Category %in% as.vector(top_crimes$Var1),]
reg_day_top_crime$DateOnly <- substring(reg_day_top_crime$Dates, 1, 10)
#reg_day_top_crime$DateOnly
reg_day_avg <- colMedians(table(reg_day_top_crime$DateOnly, droplevels(reg_day_top_crime$Category)))
#reg_day_avg
holiday_comparison_df <- data.frame(NewYear=new_year_avg, Valentine = valentine_avg, Ind=ind_day_avg, Veterans=veterans_avg, Christmas=christmas_avg, Regular=reg_day_avg)
row.names(holiday_comparison_df) <- sort(top_crimes$Var1)
holiday_comparison_df
par(xpd=TRUE)
barplot(as.matrix(holiday_comparison_df), main="Crimes in Special Days", col=rainbow(nrow(holiday_comparison_df)), xlab="Special Days", bty='L')
legend("topright",
legend = sort(top_crimes$Var1),
fill = rainbow(nrow(holiday_comparison_df)), cex=0.4)
Let’s see how the plot varies throughout the 24 hours in a day:
crime_time_df <- data.frame(Time=as.POSIXct(substring(train$Dates,12), format="%H:%M:%S"), Category=train$Category)
#ggplot(data=crime_time_df, aes(x=crime_time_df$Time, y=)) + geom_point()
Let’s see if weekends have more crimes than weekdays.
library(ggplot2)
wkday <- train
wkday$Week <- "Weekday"
wkday[wkday$DayOfWeek == "Saturday" | wkday$DayOfWeek == "Sunday",]$Week <- "Weekend"
wkday_df <- (data.frame(Week=wkday$Week, Category=wkday$Category))
wkday_df
g <- ggplot(wkday_df, aes(Week))
g + geom_bar() + geom_bar(aes(fill=Category)) + ggtitle("Weekday vs. Weekend") + theme(axis.text.x = element_text(angle=90,hjust=1))
wkday_top_crime <- wkday
wk_table <- table(wkday_top_crime$Week)
#wkday_result <- data.frame(Weekday=table(wkday_top_crime$Category, wkday_top_crime$Week)[,1]/wk_table["Weekday"],
# Weekend=table(wkday_top_crime$Category, wkday_top_crime$Week)[,2]/wk_table["Weekend"])
wkday_result <- data.frame(Weekday=table(wkday_top_crime$Category, wkday_top_crime$Week)[,1]/5,
Weekend=table(wkday_top_crime$Category, wkday_top_crime$Week)[,2]/2)
wkday_result
g + theme(legend.position="right")
par(xpd=TRUE)
barplot(as.matrix(wkday_result), main="Weekdays vs. Weekends", col=rainbow(nrow(wkday_result)), xlab="Day of Week", bty='L')
legend("topright",
legend = sort(top_crimes$Var1),
fill = rainbow(nrow(wkday_result)), cex=0.4)
It seems that whether a day is a weekday or a weekend doesn’t affect both the category and the quantity of crimes…So criminals doesn’t have day-offs! SAD! Umm common sense tells me that more crimes take place at night than during the day. Let’s divide the time into day and night!
library(chron)
train$Time <- times(substring(train$Dates,12))
dayNight <- data.frame(Times = times(substring(train$Dates,12)), Cat = train$Category)
breaks <- c(0,6,10,14,18,24)/24
labels <- c("EarlyMorning","Morning","Noon","Afternoon","Evening")
dayNight$ind <- cut(dayNight$Times, breaks, labels, include.lowest = TRUE)
train$TimeInDay <- cut(train$Time, breaks, labels, include.lowest = T)
dayNight
g <- ggplot(dayNight, aes(ind))
g + geom_bar() + geom_bar(aes(fill=Cat)) + ggtitle("Crime in a day") + theme(axis.text.x = element_text(angle=90,hjust=1))
dayNight <- data.frame(Times = times(substring(train$Dates,12)), Cat = train$Category)
breaks <- c(0,5, 20, 24)/24
labels <- c("Night","Day","Night2")
dayNight$ind <- cut(dayNight$Times, breaks, labels, include.lowest = TRUE)
train$DayNight <- cut(train$Time, breaks, labels, include.lowest = T)
dayNight$ind <- gsub("Night2", "Night", dayNight$ind)
train$DayNight <- gsub("Night2", "Night", train$DayNight)
g <- ggplot(dayNight, aes(ind))
g + geom_bar() + geom_bar(aes(fill=Cat)) + ggtitle("Crime in a day") + theme(axis.text.x = element_text(angle=90,hjust=1))
Also, maybe crimes are correlated with seasons? Let’s check it out! But again, the incompleteness of the data causes us a lot of trouble and might lead to inaccuracies, so some sort of averaging is needed.
seasons <- train
# March, April, May <=> Spring
seasons$Season <- "Spring"
train$Season <- "Spring"
# June, July, August <=> Summer
seasons[seasons$Month == "06" | seasons$Month == "07" | seasons$Month == "08",]$Season <- "Summer"
train[train$Month == "06" | train$Month == "07" | train$Month == "08",]$Season <- "Summer"
# September, October, November <=> Fall
seasons[seasons$Month == "09" | seasons$Month == "10" | seasons$Month == "11",]$Season <- "Fall"
train[train$Month == "09" | train$Month == "10" | train$Month == "11",]$Season <- "Fall"
# December, January, February <=> Winter
seasons[seasons$Month == "12" | seasons$Month == "01" | seasons$Month == "02",]$Season <- "Winter"
train[train$Month == "12" | train$Month == "01" | train$Month == "02",]$Season <- "Winter"
season_df <- (data.frame(Season=seasons$Season, Category=seasons$Category))
g <- ggplot(season_df, aes(Season))
g + geom_bar() + geom_bar(aes(fill=Category)) + ggtitle("Crime by Season") + theme(axis.text.x = element_text(angle=90,hjust=1))
PdDistrict is still unchecked.
area_df <- (data.frame(District=train$PdDistrict, Category=train$Category))
area_df
g <- ggplot(area_df, aes(District))
g + geom_bar() + geom_bar(aes(fill=Category)) + ggtitle("Crime by District") + theme(axis.text.x = element_text(angle=90,hjust=1))
Let’s do some text mining as this is by far the most useful information! First, deal with synonyms of the cats
train$Keyword <- NA
library("xlsx")
library("wordnet")
#setDict("/usr/local/Cellar/wordnet/3.1")
#initDict()
old_cats <- tolower(names(sort(table(train$Category), decreasing=F)))
for (cat in old_cats) {
origin_cat <- cat
if (grepl("/", cat)) {
cat <- strsplit(cat, "/")
for (word in cat[[1]]) {
syn_list <- synonyms(word, "NOUN")
pattern <- paste(syn_list, collapse = "|")
train$Keyword[grepl(pattern, train$Descript)] <- origin_cat
}
} else {
cat <- removeWords(cat, stopwords("en"))
cat <- trimws(gsub(" +", " ", cat))
cat <- strsplit(cat, " ")
for (word in cat[[1]]) {
patter <- paste(synonyms(word, "NOUN"), collapse = "|")
train$Keyword[grepl(pattern, train$Descript)] <- origin_cat
}
}
}
Words with top frequency.
# Words in Descript with top frequency
freq_words <- (tolower(freq_words))
remove <- c("FROM", "WITH")
freq_words <- freq_words[!freq_words %in% remove]
for (word in freq_words) {
train$Keyword[grepl(word, train$Descript)] <- word
}
#freq_words
#freq_words_str <- paste(freq_words, collapse = "|")
#freq_words_str
Lastly, perfect matches.
library(tm)
library(stringr)
# Preprocess the categories
train$Descript <- tolower(train$Descript)
old_cats <- tolower(names(sort(table(train$Category), decreasing=F)))
for (cat in old_cats) {
if (grepl("/", cat)) {
pattern <- gsub("/", "|", cat)
#print(cat)
} else {
pattern <- removeWords(cat, stopwords("en"))
pattern <- trimws(gsub(" +", " ", pattern))
}
train$Keyword[grepl(pattern, train$Descript)] <- cat
}
Finally we need to take a look at the resolution…
res_df <- data.frame(Resolution = train$Resolution, Category = train$Category)
g <- ggplot(res_df, aes(Resolution))
g + geom_bar() + geom_bar(aes(fill=Category)) + ggtitle("Crime Resolutions") + theme(axis.text.x = element_text(angle=90,hjust=1)) +
theme(legend.position="right") + theme(legend.text = element_text(size=5))
colnames(train)[which(names(train) == "Keyword")] <- "Keyword"
prop.table(table(is.na(train$Keyword)))
FALSE
1
table(train$Keyword)
aided arrest arson assault
30318 9011 1483 17633
auto automobile battery bribery
82 26897 33791 148
burglary case drug/narcotic embezzlement
38956 2259 11335 197
entry extortion forgery/counterfeiting fraud
1 256 7907 4295
gambling kidnapping larceny/theft license
144 613 187437 885
loitering lost mischief other offenses
1383 38153 10482 42
pornography/obscene mat possession property prostitution
144 34417 16501 7227
recovered vehicle robbery runaway stolen
6796 22507 1946 11226
stolen property suicide suspended suspicious
4292 508 26839 3164
suspicious occ traffic trespass vandalism
22280 22303 6766 34070
vehicle vehicle theft violation warrant
15558 167594 25908 24295
unique(train$Keyword)
[1] "warrant" "traffic" "larceny/theft" "automobile"
[5] "vehicle theft" "vandalism" "property" "robbery"
[9] "assault" "lost" "suspicious" "aided"
[13] "violation" "suspended" "stolen" "burglary"
[17] "recovered vehicle" "forgery/counterfeiting" "possession" "drug/narcotic"
[21] "arrest" "stolen property" "trespass" "fraud"
[25] "battery" "vehicle" "suspicious occ" "runaway"
[29] "prostitution" "mischief" "arson" "pornography/obscene mat"
[33] "case" "license" "kidnapping" "suicide"
[37] "bribery" "loitering" "embezzlement" "extortion"
[41] "gambling" "auto" "other offenses" "entry"
Need to improve…Reduce TRUE values…